import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import nltk
import gensim
import re
import os
import string
import torch
import joblib
import sys
import random
import warnings
warnings.filterwarnings("ignore")
from transformers import BertConfig, AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix , classification_report, accuracy_score, roc_auc_score, f1_score
from sklearn.metrics import roc_curve
from yellowbrick.classifier import ROCAUC
from wordcloud import WordCloud
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
from gensim.models import Word2Vec
from collections import Counter
from tqdm import tqdm
from pprint import pprint
from configparser import ConfigParser
#train_path = 'Project Datasets/intent classification/is_train.json'
#val_path = 'Project Datasets/intent classification/is_val.json'
#test_path = 'Project Datasets/intent classification/is_test.json'
#train = json.load(open(train_path,'r'))
#val = json.load(open(val_path,'r'))
#test = json.load(open(test_path,'r'))
# (hard-coded paths above are superseded by the config-driven loading below)
# Reading Config file
file = "config.ini"
config = ConfigParser()
# read() parses the ini file; it returns the list of files successfully read
config.read(file,encoding='utf-8')
# NOTE(review): the next line looks like pasted notebook output of config.read —
# it is a no-op list literal and can be deleted from the script version
['config.ini']
# Map every key of the [FilePaths] section to its path string
filepaths = dict(config.items('FilePaths'))
# Reading the train/validation/test json files.
# Context managers replace the original `json.load(open(...))` pattern, which
# never closed the file handles.
with open(filepaths['train_path'], 'r') as fh:
    train = json.load(fh)
with open(filepaths['val_path'], 'r') as fh:
    val = json.load(fh)
with open(filepaths['test_path'], 'r') as fh:
    test = json.load(fh)
len(val)
3000
len(test)
4500
len(train)
15000
# Function to iterate through json files and create dataframes
def createDataframe(json_var):
    """Build a two-column dataframe from a list of [text, intent] pairs.

    json_var: iterable of 2-element sequences (utterance, intent label).
    Returns a DataFrame with columns 'text' and 'intent'.
    """
    records = [{'text': pair[0], 'intent': pair[1]} for pair in json_var]
    return pd.DataFrame(records)
train_df = createDataframe(train)
test_df = createDataframe(test)
val_df = createDataframe(val)
train_df
| text | intent | |
|---|---|---|
| 0 | what expression would i use to say i love you ... | translate |
| 1 | can you tell me how to say 'i do not speak muc... | translate |
| 2 | what is the equivalent of, 'life is good' in f... | translate |
| 3 | tell me how to say, 'it is a beautiful morning... | translate |
| 4 | if i were mongolian, how would i say that i am... | translate |
| ... | ... | ... |
| 14995 | can you explain why my card was declined | card_declined |
| 14996 | how come starbucks declined my card when i tri... | card_declined |
| 14997 | how come my card was not accepted yesterday | card_declined |
| 14998 | find out what happened to make my card get dec... | card_declined |
| 14999 | why was my card declined at safeway | card_declined |
15000 rows × 2 columns
train_df.intent.value_counts()
translate 100
order_status 100
goodbye 100
account_blocked 100
what_song 100
...
reminder 100
change_speed 100
tire_pressure 100
no 100
card_declined 100
Name: intent, Length: 150, dtype: int64
train_df.intent.nunique()
150
val_df.intent.value_counts()
translate 20
order_status 20
goodbye 20
account_blocked 20
what_song 20
..
reminder 20
change_speed 20
tire_pressure 20
no 20
card_declined 20
Name: intent, Length: 150, dtype: int64
test_df.intent.value_counts()
translate 30
order_status 30
goodbye 30
account_blocked 30
what_song 30
..
reminder 30
change_speed 30
tire_pressure 30
no 30
card_declined 30
Name: intent, Length: 150, dtype: int64
# Helper to compute the top-n most frequent n-grams of a corpus; the functions after it check whether any text (train, test, val) has emails, urls, mentions or hashtags
def ngrams_top(corpus, ngram_range, n=10):
    """Return a dataframe of the `n` most frequent n-grams in `corpus`.

    English stopwords are removed by the vectorizer; `ngram_range` is the
    (min_n, max_n) tuple forwarded straight to CountVectorizer. The result
    has columns 'text' (the n-gram) and 'count', sorted descending by count.
    """
    vectorizer = CountVectorizer(stop_words='english', ngram_range=ngram_range)
    counts = vectorizer.fit_transform(corpus)
    # Column-wise totals give each n-gram's corpus-wide frequency
    totals = counts.sum(axis=0)
    ranked = sorted(
        ((gram, totals[0, col]) for gram, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return pd.DataFrame(ranked[:n], columns=['text', 'count'])
def get_emails(x):
    """Return every email-like substring of `x`, joined by single spaces."""
    found = re.findall(r'[\w\.-]+@[\w-]+\.[\w]+', str(x))
    return " ".join(found)
def get_urls(x):
    """Return every http/https URL found in `x`, joined by single spaces."""
    found = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\.[\w]+', str(x))
    return " ".join(found)
def get_mentions(x):
    """Return every @-mention (word after '@') in `x`, joined by single spaces."""
    found = re.findall(r'(?<=@)\w+', str(x))
    return " ".join(found)
def get_hashtags(x):
    """Return every hashtag (word after '#') in `x`, joined by single spaces."""
    found = re.findall(r'(?<=#)\w+', str(x))
    return " ".join(found)
def text_at_a_glance(df):
    """Print how many rows of a text Series contain emails, urls, mentions, hashtags.

    df: pandas Series of strings (despite the name — callers pass e.g. train_df.text).
    Output is identical to the original four copy-pasted stanzas; the
    (extractor, label) table just removes the duplication.
    """
    checks = [
        (get_emails, "emails"),
        (get_urls, "urls"),
        (get_mentions, "mentions"),
        (get_hashtags, "hashtags"),
    ]
    for extractor, label in checks:
        res = df.apply(extractor)
        # Keep only rows where the extractor found something
        res = res[res.values != ""]
        print("Data has {} rows with {}".format(len(res), label))
train_df.text
0 what expression would i use to say i love you ...
1 can you tell me how to say 'i do not speak muc...
2 what is the equivalent of, 'life is good' in f...
3 tell me how to say, 'it is a beautiful morning...
4 if i were mongolian, how would i say that i am...
...
14995 can you explain why my card was declined
14996 how come starbucks declined my card when i tri...
14997 how come my card was not accepted yesterday
14998 find out what happened to make my card get dec...
14999 why was my card declined at safeway
Name: text, Length: 15000, dtype: object
text_at_a_glance(train_df.text)
Data has 0 rows with emails Data has 0 rows with urls Data has 0 rows with mentions Data has 1 rows with hashtags
# Checking where the hashtag is present in the train set, punctuations will be taken care of by vectorizer
temp = train_df.text.apply(get_hashtags)
train_df.iloc[temp[temp.values!=""].index].text
13644 put mambo #5 by lou bega on my playlist, please Name: text, dtype: object
text_at_a_glance(val_df.text)
Data has 0 rows with emails Data has 0 rows with urls Data has 0 rows with mentions Data has 0 rows with hashtags
text_at_a_glance(test_df.text)
Data has 0 rows with emails Data has 0 rows with urls Data has 0 rows with mentions Data has 0 rows with hashtags
# Tokenization
# Word-tokenize the raw text with NLTK's default (Treebank-style) tokenizer
train_df['tokenized_text'] = train_df['text'].apply(nltk.tokenize.word_tokenize)
# Removing punctuations from the column 'text' and text lowercase conversion
# (iterates character-by-character; joins the kept characters back without separators)
train_df['cleaned_text'] = train_df['text'].apply(lambda i :"".join(t.lower() for t in i if t not in string.punctuation))
# Cleaned text tokenization
train_df['cleaned_tokenized_text'] = train_df['cleaned_text'].apply(nltk.tokenize.word_tokenize)
# Loading all english stopwords into a set
en_stops = set(stopwords.words('english'))
# Removing stopwords from cleaned tokens
train_df['cleaned_tokenized_text'] = train_df['cleaned_tokenized_text'].apply(lambda x: [i for i in x if i not in en_stops])
# Calculating word frequencies
words = []
for each in train_df.cleaned_tokenized_text:
    words.extend(each)
# Corpus-wide token -> frequency map.
# NOTE(review): this module-level name `word_freq` is later overwritten by the
# `def word_freq(intent)` function further down — consider renaming one of them.
word_freq = Counter(words)
# Sorting the counter variable in descending order of counts
sorted_word_freq = {k: v for k, v in sorted(word_freq.items(), key=lambda item: item[1], reverse=True)}
# Word frequencies in descending order for the whole dataframe
sorted_word_freq
{'need': 1307,
'please': 1144,
'card': 1137,
'tell': 998,
'know': 843,
'get': 749,
'credit': 685,
'whats': 591,
'account': 567,
'list': 530,
'car': 508,
'want': 507,
'like': 506,
'would': 429,
'change': 409,
'new': 382,
'bank': 379,
'time': 376,
'many': 360,
'much': 352,
'go': 351,
'find': 349,
'make': 336,
'take': 315,
'long': 315,
'name': 307,
'next': 305,
'help': 304,
'bill': 295,
'set': 284,
'oil': 268,
'visa': 265,
'use': 259,
'im': 245,
'reservation': 239,
'phone': 227,
'pay': 227,
'call': 223,
'shopping': 222,
'last': 220,
'flight': 219,
'song': 215,
'vacation': 212,
'date': 207,
'number': 207,
'good': 206,
'put': 203,
'gas': 201,
'one': 195,
'let': 193,
'add': 192,
'tires': 189,
'march': 188,
'order': 186,
'limit': 185,
'current': 181,
'days': 181,
'check': 176,
'could': 171,
'travel': 169,
'going': 167,
'give': 165,
'calendar': 161,
'insurance': 160,
'playlist': 159,
'say': 158,
'id': 156,
'day': 155,
'score': 155,
'request': 153,
'report': 153,
'today': 144,
'ai': 141,
'location': 141,
'schedule': 140,
'right': 137,
'rate': 136,
'meeting': 135,
'lost': 127,
'see': 121,
'things': 120,
'points': 118,
'chase': 117,
'dollars': 116,
'got': 115,
'pm': 113,
'5': 111,
'pto': 111,
'status': 109,
'reservations': 109,
'reminder': 108,
'checking': 107,
'amount': 107,
'2': 106,
'4': 105,
'start': 105,
'timezone': 104,
'kind': 103,
'way': 102,
'dont': 102,
'transaction': 100,
'401k': 100,
'routing': 100,
'money': 99,
'look': 99,
'taxes': 99,
'pin': 99,
'life': 98,
'roll': 98,
'settings': 98,
'uber': 98,
'direct': 97,
'apr': 97,
'traffic': 97,
'due': 96,
'luggage': 96,
'interest': 96,
'book': 95,
'text': 95,
'show': 94,
'changed': 94,
'alarm': 93,
'play': 93,
'meaning': 92,
'work': 92,
'send': 91,
'plan': 91,
'dinner': 90,
'voice': 90,
'place': 89,
'application': 89,
'timer': 88,
'increase': 87,
'payment': 87,
'coin': 87,
'deposit': 86,
'month': 86,
'balance': 85,
'rewards': 85,
'volume': 85,
'cancel': 85,
'tomorrow': 84,
'hear': 82,
'may': 81,
'w2': 81,
'whisper': 80,
'america': 78,
'made': 78,
'paycheck': 78,
'flip': 78,
'think': 77,
'people': 76,
'dice': 76,
'minimum': 76,
'na': 75,
'switch': 75,
'10': 74,
'confirm': 74,
'calories': 74,
'stop': 73,
'reviews': 73,
'savings': 72,
'back': 72,
'type': 72,
'declined': 72,
'american': 71,
'wan': 70,
'fun': 70,
'6': 69,
'turn': 69,
'jump': 69,
'milk': 66,
'apply': 66,
'buy': 66,
'chicken': 66,
'factory': 66,
'talk': 65,
'transactions': 65,
'spell': 65,
'left': 64,
'something': 64,
'table': 64,
'meetings': 64,
'hotel': 63,
'soon': 63,
'person': 63,
'transfer': 63,
'possible': 63,
'total': 63,
'male': 63,
'red': 62,
'3': 61,
'1st': 61,
'8': 61,
'used': 61,
'fuel': 61,
'carry': 61,
'ingredients': 61,
'speak': 60,
'must': 60,
'discover': 60,
'mastercard': 60,
'room': 60,
'fraudulent': 60,
'delta': 59,
'thats': 59,
'shots': 59,
'bus': 58,
'answer': 57,
'cant': 57,
'water': 57,
'fees': 57,
'recipe': 57,
'pets': 57,
'everything': 56,
'1': 56,
'pizza': 56,
'year': 56,
'checks': 56,
'alerts': 55,
'april': 55,
'using': 55,
'amex': 55,
'todo': 55,
'someone': 54,
'scheduled': 54,
'food': 54,
'cook': 54,
'pressure': 54,
'international': 54,
'french': 53,
'sure': 53,
'airlines': 53,
'since': 53,
'traveling': 52,
'plug': 52,
'city': 51,
'share': 51,
'ive': 51,
'income': 51,
'benefits': 51,
'fridge': 51,
'week': 50,
'weather': 50,
'stolen': 50,
'air': 50,
'mode': 50,
'ask': 49,
'remind': 49,
'read': 49,
'paid': 49,
'cars': 49,
'restaurant': 49,
'mpg': 49,
'hobbies': 49,
'able': 48,
'repeat': 48,
'language': 48,
'available': 48,
'rent': 48,
'bills': 48,
'another': 47,
'rental': 47,
'health': 47,
'replace': 47,
'job': 47,
'yet': 47,
'sided': 47,
'monday': 47,
'rollover': 47,
'spend': 47,
'ahead': 46,
'form': 46,
'dallas': 46,
'miles': 46,
'spent': 46,
'reminders': 46,
'converter': 46,
'joke': 46,
'lets': 45,
'minutes': 45,
'home': 45,
'enough': 45,
'battery': 45,
'yes': 44,
'inform': 44,
'state': 44,
'remove': 44,
'eat': 44,
'hold': 43,
'mexico': 43,
'fact': 43,
'talking': 43,
'wait': 43,
'approved': 43,
'info': 43,
'busy': 43,
'locate': 42,
'friday': 42,
'around': 42,
'york': 42,
'airport': 42,
'tax': 42,
'appointment': 42,
'meal': 42,
'checkbooks': 42,
'currently': 41,
'robin': 41,
'holiday': 41,
'chicago': 41,
'restaurants': 41,
'cable': 41,
'funny': 41,
'exchange': 41,
'gps': 41,
'hello': 40,
'two': 40,
'7': 40,
'remember': 40,
'recent': 40,
'suggest': 40,
'yesterday': 40,
'tire': 40,
'spanish': 39,
'really': 39,
'january': 39,
'february': 39,
'thank': 38,
'real': 38,
'information': 38,
'cards': 38,
'express': 38,
'tails': 38,
'expire': 38,
'accent': 38,
'needs': 37,
'best': 37,
'instead': 37,
'old': 37,
'called': 37,
'later': 37,
'electric': 37,
'british': 37,
'shop': 36,
'expect': 36,
'true': 36,
'near': 36,
'thanks': 36,
'nutritional': 36,
'carryon': 36,
'coordinates': 36,
'accounts': 35,
'safe': 35,
'company': 35,
'tuesday': 35,
'olive': 35,
'playing': 35,
'bad': 35,
'cash': 34,
'getting': 34,
'june': 34,
'frozen': 34,
'garden': 34,
'charge': 34,
'music': 34,
'youre': 33,
'federal': 33,
'setting': 33,
'activity': 33,
'taken': 33,
'spending': 33,
'light': 33,
'nutrition': 33,
'inches': 33,
'country': 32,
'facts': 32,
'replaced': 32,
'replacement': 32,
'anything': 32,
'flights': 32,
'hey': 32,
'cream': 32,
'far': 32,
'owe': 32,
'expiration': 32,
'word': 31,
'cat': 31,
'capital': 31,
'process': 31,
'alert': 31,
'policy': 31,
'steps': 31,
'low': 31,
'visit': 31,
'first': 31,
'trivia': 31,
'born': 31,
'reserve': 31,
'wells': 31,
'fargo': 31,
'la': 31,
'block': 31,
'redeem': 31,
'ons': 31,
'substitute': 31,
'convert': 31,
'definition': 30,
'point': 30,
'figure': 30,
'recently': 30,
'free': 30,
'tonight': 30,
'closest': 30,
'trip': 30,
'seattle': 30,
'package': 30,
'slow': 30,
'reset': 30,
'italian': 29,
'20': 29,
'ill': 29,
'create': 29,
'canada': 29,
'12': 29,
'keep': 29,
'refer': 29,
'saved': 29,
'lot': 29,
'yen': 29,
'said': 28,
'500': 28,
'level': 28,
'travelling': 28,
'vegas': 28,
'come': 28,
'times': 28,
'human': 28,
'dead': 28,
'love': 27,
'hows': 27,
'france': 27,
'high': 27,
'dog': 27,
'update': 27,
'3rd': 27,
'mailed': 27,
'kinds': 27,
'steakhouse': 27,
'noon': 27,
'freeze': 27,
'phoenix': 27,
'didnt': 27,
'nearest': 27,
'correct': 26,
'english': 26,
'different': 26,
'track': 26,
'us': 26,
'mail': 26,
'needed': 26,
'miami': 26,
'walmart': 26,
'purchase': 26,
'ratings': 26,
'message': 26,
'expires': 26,
'pounds': 26,
'ta': 25,
'united': 25,
'looking': 25,
'instructions': 25,
'types': 25,
'false': 25,
'outback': 25,
'online': 25,
'reward': 25,
'starbucks': 25,
'ab123': 25,
'tank': 25,
'items': 25,
'restrictions': 25,
'maintenance': 25,
'engine': 25,
'1234': 25,
'five': 24,
'mom': 24,
'mean': 24,
'connect': 24,
'changing': 24,
'todays': 24,
'arrive': 24,
'whens': 24,
'houston': 24,
'original': 24,
'still': 24,
'sushi': 24,
'required': 24,
'disconnect': 24,
'socket': 24,
'bread': 23,
'making': 23,
'assistance': 23,
'ways': 23,
'southwest': 23,
'heads': 23,
'cheese': 23,
'2019': 23,
'san': 23,
'boston': 23,
'cake': 23,
'apple': 23,
'pie': 23,
'female': 23,
'working': 23,
'boss': 23,
'ten': 22,
'x': 22,
'15': 22,
'die': 22,
'’': 22,
'south': 22,
'africa': 22,
'rating': 22,
'longer': 22,
'takes': 22,
'processed': 22,
'require': 22,
'trying': 22,
'suggestion': 22,
'paying': 22,
'denver': 22,
'cleaning': 22,
'speed': 22,
'calorie': 22,
'map': 22,
'returning': 22,
'translate': 21,
'italy': 21,
'walk': 21,
'interesting': 21,
'eggs': 21,
'computer': 21,
'office': 21,
'speaking': 21,
'appreciate': 21,
'coming': 21,
'spaghetti': 21,
'mall': 21,
'laundry': 21,
'skip': 21,
'charged': 21,
'nice': 21,
'per': 21,
'content': 21,
'event': 21,
'goes': 21,
'damaged': 21,
'centimeters': 21,
'vaccinations': 21,
'tourist': 20,
'minute': 20,
'explain': 20,
'paris': 20,
'las': 20,
'japan': 20,
'9': 20,
'zone': 20,
'else': 20,
'delete': 20,
'sort': 20,
'provide': 20,
'tampa': 20,
'gallon': 20,
'speech': 20,
'healthy': 20,
'rooms': 20,
'enjoy': 20,
'roundtrip': 20,
'bot': 20,
'move': 19,
'wish': 19,
'four': 19,
'exactly': 19,
'theres': 19,
'reason': 19,
'search': 19,
'russia': 19,
'2nd': 19,
'august': 19,
'cats': 19,
'questions': 19,
'dogs': 19,
'los': 19,
'looked': 19,
'bob': 19,
'away': 19,
'already': 19,
'sugar': 19,
'either': 19,
'applebees': 19,
'macaroni': 19,
'station': 19,
'beef': 19,
'wednesday': 19,
'specific': 19,
'conversion': 19,
'chinese': 18,
'goodbye': 18,
'means': 18,
'100': 18,
'assist': 18,
'spain': 18,
'allowed': 18,
'dates': 18,
'11': 18,
'july': 18,
'raise': 18,
'great': 18,
'angeles': 18,
'swap': 18,
'salary': 18,
'checked': 18,
'dl123': 18,
'leave': 18,
'calling': 18,
'john': 18,
'subjects': 18,
'vehicle': 18,
'drive': 18,
'square': 18,
'target': 18,
'bring': 18,
'lately': 18,
'highway': 18,
'task': 18,
'birthday': 18,
'stay': 18,
'freezer': 18,
'1000': 18,
'temperature': 18,
'age': 18,
'morning': 17,
'england': 17,
'germany': 17,
'amazon': 17,
'count': 17,
'latest': 17,
'places': 17,
'needing': 17,
'lobster': 17,
'mcdonalds': 17,
'chilis': 17,
'pair': 17,
'saturday': 17,
'earned': 17,
'faster': 17,
'fill': 17,
'okay': 17,
'lights': 17,
'market': 16,
'pet': 16,
'house': 16,
'china': 16,
'improve': 16,
'ago': 16,
'soda': 16,
'maybe': 16,
'party': 16,
'kevin': 16,
'try': 16,
'fee': 16,
'clear': 16,
'slower': 16,
'chocolate': 16,
'pesos': 16,
'listening': 16,
'booked': 16,
'jokes': 16,
'route': 16,
'hawaii': 15,
'30': 15,
'aware': 15,
'7th': 15,
'5th': 15,
'hurt': 15,
'understand': 15,
'ok': 15,
'created': 15,
'live': 15,
'carrots': 15,
'groceries': 15,
'added': 15,
'ice': 15,
'ran': 15,
'well': 15,
'chris': 15,
'rock': 15,
'placed': 15,
'austin': 15,
'suggestions': 15,
'mileage': 15,
'recommended': 15,
'pork': 15,
'salt': 15,
'plugs': 15,
'rubles': 15,
'speaker': 15,
'wheres': 14,
'mexican': 14,
'complete': 14,
'200': 14,
'lower': 14,
'korea': 14,
'london': 14,
'years': 14,
'4th': 14,
'black': 14,
'thing': 14,
'delivered': 14,
'receive': 14,
'ordered': 14,
'mind': 14,
'exact': 14,
'idea': 14,
'steve': 14,
'birth': 14,
'include': 14,
'tasks': 14,
'movies': 14,
'blocked': 14,
'citibank': 14,
'whether': 14,
'detroit': 14,
'fast': 14,
'orlando': 14,
'burger': 14,
'steak': 14,
'store': 14,
'euros': 14,
'locked': 14,
'fraud': 14,
'ounces': 14,
'hi': 13,
'german': 13,
'brazil': 13,
'navy': 13,
'400': 13,
'debit': 13,
'might': 13,
'changes': 13,
'build': 13,
'respond': 13,
'address': 13,
'payday': 13,
'reach': 13,
'land': 13,
'board': 13,
'pick': 13,
'prefer': 13,
'root': 13,
'5pm': 13,
'800': 13,
'immediately': 13,
'remaining': 13,
'thursday': 13,
'rolled': 13,
'thai': 13,
'tv': 13,
'seems': 13,
'dishes': 13,
'potato': 13,
'wrong': 13,
'half': 13,
'mechanic': 13,
'bake': 13,
'cups': 13,
'ihop': 13,
'favorite': 13,
'fan': 13,
'return': 13,
'atm': 12,
'learn': 12,
'b': 12,
'run': 12,
'auto': 12,
'finding': 12,
'somewhere': 12,
'8th': 12,
'submit': 12,
'pull': 12,
'average': 12,
'given': 12,
'tokyo': 12,
'weeks': 12,
'little': 12,
'baby': 12,
'extra': 12,
'rice': 12,
'sorts': 12,
'programmed': 12,
'taco': 12,
'three': 12,
'allow': 12,
'doctors': 12,
'reno': 12,
'tune': 12,
'dollar': 12,
'nashville': 12,
'lunch': 12,
'roast': 12,
'lax': 12,
'typically': 12,
'salad': 12,
'baking': 12,
'ones': 12,
'door': 12,
'beatles': 12,
'50': 11,
'starting': 11,
'six': 11,
'cell': 11,
'located': 11,
'ireland': 11,
'12th': 11,
'notify': 11,
'anyone': 11,
'discovery': 11,
'jim': 11,
'uk': 11,
'procedure': 11,
'listen': 11,
'grill': 11,
'pittsburgh': 11,
'believe': 11,
'francisco': 11,
'purchases': 11,
'open': 11,
'king': 11,
'missing': 11,
'activities': 11,
'big': 11,
'associated': 11,
'clean': 11,
'full': 11,
'soup': 11,
'sunday': 11,
'feet': 11,
'rules': 11,
'unsync': 11,
'oven': 11,
'dial': 11,
'ac': 11,
'title': 11,
'forecast': 11,
'vaccines': 11,
'directions': 11,
'spelling': 11,
'deposited': 11,
'grocery': 10,
'meet': 10,
'bar': 10,
'bye': 10,
'bathroom': 10,
'carrie': 10,
'payments': 10,
'define': 10,
'obtain': 10,
'recommend': 10,
'care': 10,
'australia': 10,
'kenya': 10,
'15th': 10,
'november': 10,
'showing': 10,
'decrease': 10,
'shoes': 10,
'bit': 10,
'default': 10,
'supposed': 10,
'central': 10,
'rome': 10,
'app': 10,
'gone': 10,
'accepted': 10,
'present': 10,
'begin': 10,
'arrived': 10,
'choose': 10,
'came': 10,
'names': 10,
'bananas': 10,
'potatoes': 10,
'trash': 10,
'butter': 10,
'sorry': 10,
'cost': 10,
'necessary': 10,
'beach': 10,
'liberty': 10,
'atlanta': 10,
'brooklyn': 10,
'charges': 10,
'access': 10,
'ideas': 10,
'nyc': 10,
'kentucky': 10,
'pasta': 10,
'earn': 10,
'calculate': 10,
'looks': 10,
'louder': 10,
'economy': 10,
'actually': 10,
'thailand': 10,
'union': 10,
'wendys': 10,
'casserole': 10,
'sour': 10,
'tom': 10,
'broken': 10,
'anymore': 10,
'usd': 10,
'meters': 10,
'rain': 10,
'doo': 10,
'japanese': 9,
'west': 9,
'usaa': 9,
'regular': 9,
'cookies': 9,
'eight': 9,
'purpose': 9,
'device': 9,
'cellphone': 9,
'listed': 9,
'europe': 9,
'october': 9,
'december': 9,
'weekend': 9,
'6th': 9,
'late': 9,
'james': 9,
'post': 9,
'moment': 9,
'landing': 9,
'mary': 9,
'dave': 9,
'flour': 9,
'bag': 9,
'bags': 9,
'rid': 9,
'cause': 9,
'forget': 9,
'familiar': 9,
'close': 9,
'700': 9,
'td': 9,
'6pm': 9,
'action': 9,
'rolling': 9,
'serve': 9,
'often': 9,
'evans': 9,
'annual': 9,
'brownies': 9,
'lasagna': 9,
'mashed': 9,
'cup': 9,
'shower': 9,
'divided': 9,
'internationally': 9,
'sub': 9,
'friend': 9,
'6am': 9,
'1100': 9,
'cookie': 9,
'increased': 9,
'higher': 9,
'lock': 9,
'kilos': 9,
'tablespoons': 9,
'doors': 9,
'revert': 9,
'directly': 9,
'electricity': 9,
'subway': 8,
'second': 8,
'saving': 8,
'sent': 8,
'healthcare': 8,
'options': 8,
'figuring': 8,
'turkey': 8,
'20th': 8,
'17th': 8,
'25th': 8,
'heard': 8,
'regarding': 8,
'history': 8,
'response': 8,
...}
# Converting all the intents to numerical encodings as part of target column in train set
train_df['Target'] = train_df.intent.astype('category').cat.codes
# Pre-processing on validation set
val_df['cleaned_text'] = val_df['text'].apply(lambda i :"".join(t.lower() for t in i if t not in string.punctuation))
val_df['cleaned_tokenized_text'] = val_df['cleaned_text'].apply(nltk.tokenize.word_tokenize)
val_df['cleaned_tokenized_text'] = val_df['cleaned_tokenized_text'].apply(lambda x: [i for i in x if i not in en_stops])
val_df
| text | intent | cleaned_text | cleaned_tokenized_text | |
|---|---|---|---|---|
| 0 | in spanish, meet me tomorrow is said how | translate | in spanish meet me tomorrow is said how | [spanish, meet, tomorrow, said] |
| 1 | in french, how do i say, see you later | translate | in french how do i say see you later | [french, say, see, later] |
| 2 | how do you say hello in japanese | translate | how do you say hello in japanese | [say, hello, japanese] |
| 3 | how do i ask about the weather in chinese | translate | how do i ask about the weather in chinese | [ask, weather, chinese] |
| 4 | how can i say "cancel my order" in french | translate | how can i say cancel my order in french | [say, cancel, order, french] |
| ... | ... | ... | ... | ... |
| 2995 | i was at walmart trying to buy toilet paper an... | card_declined | i was at walmart trying to buy toilet paper an... | [walmart, trying, buy, toilet, paper, card, go... |
| 2996 | target declined my card and i don't know why | card_declined | target declined my card and i dont know why | [target, declined, card, dont, know] |
| 2997 | do you know why my card was declined at target... | card_declined | do you know why my card was declined at target... | [know, card, declined, target, cant, figure] |
| 2998 | i can't figure out why my card was declined at... | card_declined | i cant figure out why my card was declined at ... | [cant, figure, card, declined, target] |
| 2999 | i was just at target and they declined my card... | card_declined | i was just at target and they declined my card... | [target, declined, card, cant, understand] |
3000 rows × 4 columns
# Converting all the intents to numerical encodings as part of target column in val set
val_df['Target'] = val_df.intent.astype('category').cat.codes
# Pre-processing on test set
test_df['cleaned_text'] = test_df['text'].apply(lambda i :"".join(t.lower() for t in i if t not in string.punctuation))
test_df['cleaned_tokenized_text'] = test_df['cleaned_text'].apply(nltk.tokenize.word_tokenize)
test_df['cleaned_tokenized_text'] = test_df['cleaned_tokenized_text'].apply(lambda x: [i for i in x if i not in en_stops])
test_df
| text | intent | cleaned_text | cleaned_tokenized_text | |
|---|---|---|---|---|
| 0 | how would you say fly in italian | translate | how would you say fly in italian | [would, say, fly, italian] |
| 1 | what's the spanish word for pasta | translate | whats the spanish word for pasta | [whats, spanish, word, pasta] |
| 2 | how would they say butter in zambia | translate | how would they say butter in zambia | [would, say, butter, zambia] |
| 3 | how do you say fast in spanish | translate | how do you say fast in spanish | [say, fast, spanish] |
| 4 | what's the word for trees in norway | translate | whats the word for trees in norway | [whats, word, trees, norway] |
| ... | ... | ... | ... | ... |
| 4495 | why can't i use my credit card | card_declined | why cant i use my credit card | [cant, use, credit, card] |
| 4496 | why won't you let me pay with my credit card | card_declined | why wont you let me pay with my credit card | [wont, let, pay, credit, card] |
| 4497 | why did i get rejected on my card | card_declined | why did i get rejected on my card | [get, rejected, card] |
| 4498 | how come my credit card isn't working | card_declined | how come my credit card isnt working | [come, credit, card, isnt, working] |
| 4499 | why didn't my card work | card_declined | why didnt my card work | [didnt, card, work] |
4500 rows × 4 columns
# Converting all the intents to numerical encodings as part of target column in test set
test_df['Target'] = test_df.intent.astype('category').cat.codes
# Grouping the dataframe per intent
# NOTE(review): .sum() on object columns concatenates each group's strings / token
# lists into one long value per intent — intentional here; the per-intent n-gram
# and wordcloud cells below rely on it.
temp_df = train_df[['text', 'tokenized_text', 'intent','cleaned_text','cleaned_tokenized_text']]
grouped_train = temp_df.groupby('intent').sum()
# Function to generate word frequencies per intent and the plot
def word_freq(intent):
    """Plot the 15 most frequent cleaned tokens of `intent` as a horizontal bar chart.

    Reads the module-level `grouped_train` frame; returns a plotly Figure.
    """
    # Counter.most_common(15) is documented as equivalent to sorting all items by
    # count (descending, stable) and slicing; it also avoids the original's local
    # dict named `word_freq`, which shadowed this function's own name.
    counts = Counter(grouped_train['cleaned_tokenized_text'][intent])
    top_df = pd.DataFrame(counts.most_common(15), columns=['words', 'count'])
    # Ascending order so the most frequent word renders as the top bar
    top_df = top_df.sort_values(by='count')
    return px.bar(data_frame=top_df, y='words', x='count', orientation='h',
                  color_discrete_sequence=['#008080'], opacity=0.8,
                  width=900, height=500,
                  title=f'"{intent}" intent_word frequencies')
word_freq('alarm')
word_freq('accept_reservations')
word_freq('account_blocked')
word_freq('book_flight')
word_freq('order_status')
word_freq('change_speed')
# Top Bigrams
bigrams = ngrams_top(train_df.text,(2,2)).sort_values(by='count')
px.bar(data_frame=bigrams,y='text',x='count',orientation='h',color_discrete_sequence=['#dc3912'],opacity=0.8,width=900,height=500, title='Bigram Bar Chart')
# Top Trigrams
trigrams = ngrams_top(train_df.text,(3,3)).sort_values(by='count')
px.bar(data_frame=trigrams,y='text',x='count',orientation='h',color_discrete_sequence=['#f58518'],opacity=0.8,width=900,height=500, title='Trigrams Bar Chart')
# Top Four Words
four_words = ngrams_top(train_df.text,(4,4)).sort_values(by='count')
px.bar(data_frame=four_words,y='text',x='count',orientation='h',color_discrete_sequence=['#f58518'],opacity=0.8,width=900,height=500, title='Four Words Bar Chart')
# Top Bigrams for specific intent car_rental
bigrams = ngrams_top([grouped_train['text']['car_rental']],(2,2)).sort_values(by='count')
px.bar(data_frame=bigrams,y='text',x='count',orientation='h',color_discrete_sequence=['#FFA07A'],opacity=0.8,width=900,height=500, title='Bigrams Bar Chart_Per Intent-"car_Rental"')
# Top Trigrams for specific intent transfer
trigrams = ngrams_top([grouped_train['cleaned_text']['transfer']],(3,3)).sort_values(by='count')
px.bar(data_frame=trigrams,y='text',x='count',orientation='h',color_discrete_sequence=['#DE3163'],opacity=0.8,width=900,height=500, title='Trigrams Bar Chart_Per Intent-"transfer"')
# Top Four Words for specific intent restaurant_reservation
four_words = ngrams_top([grouped_train['cleaned_text']['restaurant_reservation']],(4,4)).sort_values(by='count')
px.bar(data_frame=four_words,y='text',x='count',orientation='h',color_discrete_sequence=['#7A33FF'],opacity=0.8,width=900,height=500, title='Four Words Bar Chart_Per Intent-"next_holiday"')
def generate_wordcloud(all_words):
    """Render a word-cloud image for the given blob of text and show it."""
    cloud = WordCloud(
        width=800,
        height=500,
        random_state=21,
        max_font_size=100,
        relative_scaling=0.5,
        colormap='Dark2',
    ).generate(all_words)
    plt.figure(figsize=(14, 10))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()
# WordCloud for the whole text of the dataframe
all_words = ' '.join([item for item in train_df['text']])
generate_wordcloud(all_words)
# WordCloud for a specific intent card_declined
all_words = ' '.join([item for item in grouped_train['cleaned_tokenized_text']['card_declined']])
generate_wordcloud(all_words)
all_words = ' '.join([item for item in grouped_train['cleaned_tokenized_text']['nutrition_info']])
generate_wordcloud(all_words)
# Terms whose presence marks a cleaned text as money/currency-related
money_list = ['dollars', '$', 'cents', 'euros', 'usd', 'peso', 'pesos','dollar','rupees','rupee']
# NOTE(review): `cnt` is never used afterwards — candidate for removal
cnt = 0
# Keep every cleaned text containing at least one money term; the comprehension
# replaces the original append-loop and its dead `else: pass` branch
money_corpus = [text for text in train_df['cleaned_text']
                if any(word in text for word in money_list)]
len(money_corpus)
143
money_corpus_df = pd.DataFrame()
money_corpus_df['text'] = money_corpus
#money_corpus_df.to_csv('money_corpus.csv', index=False)
#train_df.to_csv('train_data.csv', index=False)
#val_df.to_csv('val_data.csv', index=False)
#test_df.to_csv('test_data.csv', index=False)
# Specifying the seed so that we will always have the same rows
np.random.seed(0)
# Randomly selecting 3 unique rows of the target column (target -> numerical encoded of intents column) from the dataframe
three_intents_traindf = train_df[train_df['Target'].isin(train_df['Target'].sample(n=3))]
print('Three Intents train df_Target col value counts: ')
print(three_intents_traindf.Target.value_counts())
print()
print('Three Intents train df_intents col value counts: ')
print(three_intents_traindf.intent.value_counts())
print()
print('Three Intents train df_Target col unique values: ')
print(three_intents_traindf.Target.unique())
print()
print('Three Intents train df shape: ')
print(three_intents_traindf.shape)
Three Intents train df_Target col value counts: 42 100 56 100 5 100 Name: Target, dtype: int64 Three Intents train df_intents col value counts: flight_status 100 ingredients_list 100 are_you_a_bot 100 Name: intent, dtype: int64 Three Intents train df_Target col unique values: [42 56 5] Three Intents train df shape: (300, 6)
# selecting those 3 rows of the target column (target -> numerical encoded of intents column) from the dataframe based on train's target values
three_intents_valdf = val_df[val_df['Target'].isin(three_intents_traindf.Target.unique())]
print('Three Intents val df_Target col value counts: ')
print(three_intents_valdf.Target.value_counts())
print()
print('Three Intents val df_intents col value counts: ')
print(three_intents_valdf.intent.value_counts())
print()
print('Three Intents val df_Target col unique values: ')
print(three_intents_valdf.Target.unique())
print()
print('Three Intents val df shape: ')
print(three_intents_valdf.shape)
Three Intents val df_Target col value counts: 42 20 56 20 5 20 Name: Target, dtype: int64 Three Intents val df_intents col value counts: flight_status 20 ingredients_list 20 are_you_a_bot 20 Name: intent, dtype: int64 Three Intents val df_Target col unique values: [42 56 5] Three Intents val df shape: (60, 5)
# selecting those 3 rows of the target column (target -> numerical encoded of intents column) from the dataframe based on train's target values
three_intents_testdf = test_df[test_df['Target'].isin(three_intents_traindf.Target.unique())]
print('Three Intents test df_Target col value counts: ')
print(three_intents_testdf.Target.value_counts())
print()
print('Three Intents test df_intents col value counts: ')
print(three_intents_testdf.intent.value_counts())
print()
print('Three Intents test df_Target col unique values: ')
print(three_intents_testdf.Target.unique())
print()
print('Three Intents test df shape: ')
print(three_intents_testdf.shape)
Three Intents test df_Target col value counts: 42 30 56 30 5 30 Name: Target, dtype: int64 Three Intents test df_intents col value counts: flight_status 30 ingredients_list 30 are_you_a_bot 30 Name: intent, dtype: int64 Three Intents test df_Target col unique values: [42 56 5] Three Intents test df shape: (90, 5)
# Saving the 3 intents dataframes
three_intents_traindf.to_csv('three_intents_train_data.csv', index=False)
three_intents_valdf.to_csv('three_intents_val_data.csv', index=False)
three_intents_testdf.to_csv('three_intents_test_data.csv', index=False)
three_intents_traindf = pd.read_csv('three_intents_train_data.csv')
three_intents_valdf = pd.read_csv('three_intents_val_data.csv')
three_intents_testdf = pd.read_csv('three_intents_test_data.csv')
word_freq(three_intents_traindf.intent.unique()[0])
word_freq(three_intents_traindf.intent.unique()[1])
word_freq(three_intents_traindf.intent.unique()[2])
three_intents_traindf
| text | intent | tokenized_text | cleaned_text | cleaned_tokenized_text | Target | |
|---|---|---|---|---|---|---|
| 1600 | what is the projected time frame for the fligh... | flight_status | [what, is, the, projected, time, frame, for, t... | what is the projected time frame for the fligh... | [projected, time, frame, flight, land] | 42 |
| 1601 | is there any news on flight dl123 | flight_status | [is, there, any, news, on, flight, dl123] | is there any news on flight dl123 | [news, flight, dl123] | 42 |
| 1602 | what is the word on flight dl123 | flight_status | [what, is, the, word, on, flight, dl123] | what is the word on flight dl123 | [word, flight, dl123] | 42 |
| 1603 | when will we begin to board my scheduled flight | flight_status | [when, will, we, begin, to, board, my, schedul... | when will we begin to board my scheduled flight | [begin, board, scheduled, flight] | 42 |
| 1604 | whats the status of my frontier flight | flight_status | [whats, the, status, of, my, frontier, flight] | whats the status of my frontier flight | [whats, status, frontier, flight] | 42 |
| ... | ... | ... | ... | ... | ... | ... |
| 13395 | tell me if you are a human or a computer | are_you_a_bot | [tell, me, if, you, are, a, human, or, a, comp... | tell me if you are a human or a computer | [tell, human, computer] | 5 |
| 13396 | can you specify if you are a human or a computer | are_you_a_bot | [can, you, specify, if, you, are, a, human, or... | can you specify if you are a human or a computer | [specify, human, computer] | 5 |
| 13397 | let me know if you are a human or a computer | are_you_a_bot | [let, me, know, if, you, are, a, human, or, a,... | let me know if you are a human or a computer | [let, know, human, computer] | 5 |
| 13398 | is this a computer right now or a human being | are_you_a_bot | [is, this, a, computer, right, now, or, a, hum... | is this a computer right now or a human being | [computer, right, human] | 5 |
| 13399 | is this human or a robot | are_you_a_bot | [is, this, human, or, a, robot] | is this human or a robot | [human, robot] | 5 |
300 rows × 6 columns
# Top Bigrams for specific intent
bigrams = ngrams_top([grouped_train['text'][three_intents_traindf.intent.unique()[0]]],(2,2)).sort_values(by='count')
px.bar(data_frame=bigrams,y='text',x='count',orientation='h',color_discrete_sequence=['#FFA07A'],opacity=0.8,width=900,height=500, title=f'Bigrams Bar Chart_For Intent:-{three_intents_traindf.intent.unique()[0]}')
bigrams = ngrams_top([grouped_train['text'][three_intents_traindf.intent.unique()[1]]],(2,2)).sort_values(by='count')
px.bar(data_frame=bigrams,y='text',x='count',orientation='h',color_discrete_sequence=['#f58518'],opacity=0.8,width=900,height=500, title=f'Bigrams Bar Chart_For Intent:-{three_intents_traindf.intent.unique()[1]}')
bigrams = ngrams_top([grouped_train['text'][three_intents_traindf.intent.unique()[2]]],(2,2)).sort_values(by='count')
px.bar(data_frame=bigrams,y='text',x='count',orientation='h',color_discrete_sequence=['#DE3163'],opacity=0.8,width=900,height=500, title=f'Bigrams Bar Chart_For Intent:-{three_intents_traindf.intent.unique()[2]}')
all_words = ' '.join([item for item in grouped_train['cleaned_tokenized_text'][three_intents_traindf.intent.unique()[0]]])
print('Intent:-',three_intents_traindf.intent.unique()[0])
generate_wordcloud(all_words)
Intent:- flight_status
all_words = ' '.join([item for item in grouped_train['cleaned_tokenized_text'][three_intents_traindf.intent.unique()[1]]])
print('Intent:-',three_intents_traindf.intent.unique()[1])
generate_wordcloud(all_words)
Intent:- ingredients_list
all_words = ' '.join([item for item in grouped_train['cleaned_tokenized_text'][three_intents_traindf.intent.unique()[2]]])
print('Intent:-',three_intents_traindf.intent.unique()[2])
generate_wordcloud(all_words)
Intent:- are_you_a_bot
# Create Skip Gram model
model_sg = gensim.models.Word2Vec(train_df['cleaned_tokenized_text'], min_count = 1, vector_size = 100, window = 5, sg = 1, workers = 4)
# Saving skipgram model
model_sg.save('word2vec_skipgram_150intents.model')
# Loading skipgram model
#loaded_sg_150_model = Word2Vec.load("word2vec_skipgram_150intents.model")
model_sg.wv.most_similar('time')
[('paid', 0.9738958477973938),
('year', 0.9643425345420837),
('day', 0.9633602499961853),
('miles', 0.9624411463737488),
('holiday', 0.9537367820739746),
('week', 0.952694833278656),
('gallon', 0.9512583613395691),
('far', 0.9502949714660645),
('taken', 0.9500401616096497),
('per', 0.9498676657676697)]
# Building a tf-idf matrix
print('building tf-idf matrix ...')
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x for x in train_df['cleaned_tokenized_text']])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print('vocab size :', len(tfidf))
building tf-idf matrix ... vocab size : 920
# Function to create an averaged document vector when given a list of tokens of the same document
def buildWordVector(tokens, size, model=None, weights=None):
    """Build a tf-idf-weighted average document vector from a token list.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single document.
    size : int
        Dimensionality of the word vectors.
    model : optional
        Embedding model exposing ``model.wv[word]``.  Defaults to the
        module-level skip-gram model ``model_sg`` (previous hard-coded
        behavior), so existing two-argument calls are unchanged.
    weights : dict, optional
        Mapping word -> idf weight.  Defaults to the module-level ``tfidf``.

    Returns
    -------
    numpy.ndarray of shape (1, size)
        Weighted mean of the in-vocabulary token vectors; all zeros when
        no token is found in the vocabulary (including an empty input).
    """
    if model is None:
        model = model_sg
    if weights is None:
        weights = tfidf
    vec = np.zeros((1, size))
    count = 0.0
    for word in tokens:
        # Skip tokens missing from either the embedding vocab or the
        # idf table (both raise KeyError) -- useful for unseen test words.
        try:
            vec += model.wv[word].reshape((1, size)) * weights[word]
            count += 1.0
        except KeyError:
            continue
    if count:
        vec /= count
    return vec
# Building the sentence embeddings for train, val and test sets respectively
train_vecs_w2v = np.concatenate([buildWordVector(z, 100) for z in tqdm(map(lambda x: x, train_df['cleaned_tokenized_text']))])
train_vecs_w2v = scale(train_vecs_w2v)
val_vecs_w2v = np.concatenate([buildWordVector(z, 100) for z in tqdm(map(lambda x: x, val_df['cleaned_tokenized_text']))])
val_vecs_w2v = scale(val_vecs_w2v)
test_vecs_w2v = np.concatenate([buildWordVector(z, 100) for z in tqdm(map(lambda x: x, test_df['cleaned_tokenized_text']))])
test_vecs_w2v = scale(test_vecs_w2v)
15000it [00:00, 60466.51it/s] 3000it [00:00, 61192.60it/s] 4500it [00:00, 59214.45it/s]
model_sg_3intents = gensim.models.Word2Vec(three_intents_traindf['cleaned_tokenized_text'], min_count = 1, vector_size = 100, window = 5, sg = 1, workers = 4)
# Building a tf-idf matrix
print('building tf-idf matrix ...')
vectorizer_3i = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix_3i = vectorizer_3i.fit_transform([x for x in three_intents_traindf['cleaned_tokenized_text']])
tfidf_3i = dict(zip(vectorizer_3i.get_feature_names(), vectorizer_3i.idf_))
print('vocab size :', len(tfidf_3i))
building tf-idf matrix ... vocab size : 20
# For 3 intents
# Function to create an averaged document vector when given a list of tokens of the same document
def buildWordVector_3i(tokens, size, model=None, weights=None):
    """Build a tf-idf-weighted average document vector (3-intent variant).

    Identical to ``buildWordVector`` but defaulting to the 3-intent
    artifacts.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single document.
    size : int
        Dimensionality of the word vectors.
    model : optional
        Embedding model exposing ``model.wv[word]``.  Defaults to the
        module-level ``model_sg_3intents`` (previous hard-coded behavior),
        so existing two-argument calls are unchanged.
    weights : dict, optional
        Mapping word -> idf weight.  Defaults to the module-level
        ``tfidf_3i``.

    Returns
    -------
    numpy.ndarray of shape (1, size)
        Weighted mean of the in-vocabulary token vectors; all zeros when
        no token is found in the vocabulary (including an empty input).
    """
    if model is None:
        model = model_sg_3intents
    if weights is None:
        weights = tfidf_3i
    vec = np.zeros((1, size))
    count = 0.0
    for word in tokens:
        # Skip tokens missing from either the embedding vocab or the
        # idf table (both raise KeyError) -- useful for unseen test words.
        try:
            vec += model.wv[word].reshape((1, size)) * weights[word]
            count += 1.0
        except KeyError:
            continue
    if count:
        vec /= count
    return vec
# Building the sentence embeddings for train, val and test sets respectively
train_vecs_w2v_3i = np.concatenate([buildWordVector_3i(z, 100) for z in tqdm(map(lambda x: x, three_intents_traindf['cleaned_tokenized_text']))])
train_vecs_w2v_3i = scale(train_vecs_w2v_3i)
val_vecs_w2v_3i = np.concatenate([buildWordVector_3i(z, 100) for z in tqdm(map(lambda x: x, three_intents_valdf['cleaned_tokenized_text']))])
val_vecs_w2v_3i = scale(val_vecs_w2v_3i)
test_vecs_w2v_3i = np.concatenate([buildWordVector_3i(z, 100) for z in tqdm(map(lambda x: x, three_intents_testdf['cleaned_tokenized_text']))])
test_vecs_w2v_3i = scale(test_vecs_w2v_3i)
300it [00:00, 75000.97it/s] 60it [00:00, 54875.32it/s] 90it [00:00, 82366.87it/s]
# initializing countvectorizer model
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
vec = vectorizer.fit_transform(train_df['cleaned_text'])
print(vec.shape)
train_vec_cv = vec.toarray()
np.unique(train_vec_cv)
(15000, 26120)
array([0, 1, 2], dtype=int64)
# Count Vectorize validation set
val_vec_cv = vectorizer.transform(val_df['cleaned_text']).toarray()
val_vec_cv.shape
(3000, 26120)
# Count Vectorize test set
test_vec_cv = vectorizer.transform(test_df['cleaned_text']).toarray()
test_vec_cv.shape
(4500, 26120)
vectorizer_3i = CountVectorizer(analyzer='word', ngram_range=(2, 2))
three_intents_train_vec_cv = vectorizer_3i.fit_transform(three_intents_traindf['cleaned_text']).toarray()
three_intents_train_vec_cv.shape
(300, 713)
three_intents_val_vec_cv = vectorizer_3i.transform(three_intents_valdf['cleaned_text']).toarray()
three_intents_val_vec_cv.shape
(60, 713)
three_intents_test_vec_cv = vectorizer_3i.transform(three_intents_testdf['cleaned_text']).toarray()
three_intents_test_vec_cv.shape
(90, 713)
# For Train set
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
train_sentences_list = []
for sent in train_df['cleaned_text']:
train_sentences_list.append(sent)
#Sentences are encoded by calling model.encode()
train_embeddings = model.encode(train_sentences_list)
train_sent_embeddings = []
#Generate the embeddings
for sentence, embedding in zip(train_sentences_list, train_embeddings):
train_sent_embeddings.append(embedding)
# For Val set
val_sentences_list = []
for sent in val_df['cleaned_text']:
val_sentences_list.append(sent)
#Sentences are encoded by calling model.encode()
val_embeddings = model.encode(val_sentences_list)
val_sent_embeddings = []
#Generate the embeddings
for sentence, embedding in zip(val_sentences_list, val_embeddings):
val_sent_embeddings.append(embedding)
# For Test set
test_sentences_list = []
for sent in test_df['cleaned_text']:
test_sentences_list.append(sent)
#Sentences are encoded by calling model.encode()
test_embeddings = model.encode(test_sentences_list)
test_sent_embeddings = []
#Generate the embeddings
for sentence, embedding in zip(test_sentences_list, test_embeddings):
test_sent_embeddings.append(embedding)
# three_intents train set
train_sentences_list_3i = []
for sent in three_intents_traindf['cleaned_text']:
train_sentences_list_3i.append(sent)
#Sentences are encoded by calling model.encode()
train_embeddings_3i = model.encode(train_sentences_list_3i)
train_sent_embeddings_3i = []
#Generate the embeddings
for sentence, embedding in zip(train_sentences_list_3i, train_embeddings_3i):
train_sent_embeddings_3i.append(embedding)
# three intents Val set
val_sentences_list_3i = []
for sent in three_intents_valdf['cleaned_text']:
val_sentences_list_3i.append(sent)
#Sentences are encoded by calling model.encode()
val_embeddings_3i = model.encode(val_sentences_list_3i)
val_sent_embeddings_3i = []
#Generate the embeddings
for sentence, embedding in zip(val_sentences_list_3i, val_embeddings_3i):
val_sent_embeddings_3i.append(embedding)
# three intents Test set
test_sentences_list_3i = []
for sent in three_intents_testdf['cleaned_text']:
test_sentences_list_3i.append(sent)
#Sentences are encoded by calling model.encode()
test_embeddings_3i = model.encode(test_sentences_list_3i)
test_sent_embeddings_3i = []
#Generate the embeddings
for sentence, embedding in zip(test_sentences_list_3i, test_embeddings_3i):
test_sent_embeddings_3i.append(embedding)
# Training the model for 150 intents
rf = RandomForestClassifier()
rf.fit(train_vecs_w2v, train_df['Target'])
# ROC AUC Score on validation set of rf model for 150 intents
roc_auc_score(val_df['Target'], rf.predict_proba(val_vecs_w2v), multi_class='ovr')
0.9649437360178972
# Training the model for 3 intents
rf_3i = RandomForestClassifier()
rf_3i.fit(train_vecs_w2v_3i, three_intents_traindf['Target'])
# ROC AUC Score on validation set of rf model for 3 intents
roc_auc_score(three_intents_valdf['Target'], rf_3i.predict_proba(val_vecs_w2v_3i), multi_class='ovr')
1.0
def plot_ROC_curve(model, xtrain, ytrain, xtest, ytest, train_set):
    """Fit and display a yellowbrick ROCAUC visualizer with readable labels.

    Generalized: the original hard-coded exactly three classes (indices
    0..2 into ``unique()``), which would raise IndexError for any other
    class count.  The encoder is now built for however many classes
    *train_set* contains.

    Parameters
    ----------
    model : estimator
        A fitted or fittable classifier.
    xtrain, ytrain : array-like
        Training features and encoded targets used to fit the visualizer.
    xtest, ytest : array-like
        Evaluation features and encoded targets used for scoring.
    train_set : pandas.DataFrame
        Must contain aligned ``Target`` (encoded) and ``intent`` (readable)
        columns.

    Returns
    -------
    ROCAUC
        The fitted visualizer.
    """
    # Map each encoded target value to its readable intent name.
    # NOTE(review): assumes Target.unique() and intent.unique() appear in
    # corresponding order -- true when each intent maps to exactly one
    # target code, as in this dataframe; confirm if reused elsewhere.
    encoder = dict(zip(train_set.Target.unique(), train_set.intent.unique()))
    visualizer = ROCAUC(model, encoder=encoder)
    # Fit on the training data first, then score with the held-out data.
    visualizer.fit(xtrain, ytrain)
    visualizer.score(xtest, ytest)
    visualizer.show()
    return visualizer
plot_ROC_curve(rf_3i, train_vecs_w2v_3i, three_intents_traindf['Target'], val_vecs_w2v_3i, three_intents_valdf['Target'], three_intents_traindf)
ROCAUC(ax=<AxesSubplot:title={'center':'ROC Curves for RandomForestClassifier'}, xlabel='False Positive Rate', ylabel='True Positive Rate'>,
encoder={5: 'are_you_a_bot', 42: 'flight_status',
56: 'ingredients_list'},
estimator=RandomForestClassifier())
print("Classification Report : \n",classification_report(three_intents_valdf['Target'], rf_3i.predict(val_vecs_w2v_3i)))
Classification Report :
precision recall f1-score support
5 1.00 1.00 1.00 20
42 1.00 1.00 1.00 20
56 1.00 1.00 1.00 20
accuracy 1.00 60
macro avg 1.00 1.00 1.00 60
weighted avg 1.00 1.00 1.00 60
# Saving rf skipgram 150 intents
joblib.dump(rf, "random_forest_sg_150.pkl")
# load
loaded_rf_sg_150 = joblib.load("random_forest_sg_150.pkl")
# Saving rf skipgram 3 intents
joblib.dump(rf_3i, "random_forest_sg_3i.pkl")
# load
loaded_rf_sg_3i = joblib.load("random_forest_sg_3i.pkl")
# F1 score on val set 150 intents
f1_score(val_df['Target'], loaded_rf_sg_150.predict(val_vecs_w2v), average='macro')
0.6624142042982822
# F1 score on val set 3 intents
f1_score(three_intents_valdf['Target'], loaded_rf_sg_3i.predict(val_vecs_w2v_3i), average='macro')
1.0
# Training the model for 150 intents
rf_cv = RandomForestClassifier()
rf_cv.fit(train_vec_cv, train_df['Target'])
RandomForestClassifier()
# ROC AUC Score on validation set of rf model for 150 intents
roc_auc_score(val_df['Target'], rf_cv.predict_proba(val_vec_cv), multi_class='ovr')
0.9686373042505594
# Training the model for 3 intents
rf_3i_cv = RandomForestClassifier().fit(three_intents_train_vec_cv, three_intents_traindf['Target'])
roc_auc_score(three_intents_valdf['Target'], rf_3i_cv.predict_proba(three_intents_val_vec_cv), multi_class='ovr')
0.9997916666666665
plot_ROC_curve(rf_3i_cv, three_intents_train_vec_cv, three_intents_traindf['Target'], three_intents_val_vec_cv, three_intents_valdf['Target'], three_intents_traindf)
ROCAUC(ax=<AxesSubplot:title={'center':'ROC Curves for RandomForestClassifier'}, xlabel='False Positive Rate', ylabel='True Positive Rate'>,
encoder={5: 'are_you_a_bot', 42: 'flight_status',
56: 'ingredients_list'},
estimator=RandomForestClassifier())
print("Classification Report : \n",classification_report(three_intents_valdf['Target'], rf_3i_cv.predict(three_intents_val_vec_cv)))
Classification Report :
precision recall f1-score support
5 0.87 1.00 0.93 20
42 1.00 0.90 0.95 20
56 1.00 0.95 0.97 20
accuracy 0.95 60
macro avg 0.96 0.95 0.95 60
weighted avg 0.96 0.95 0.95 60
# Saving rf count vectorizer 150 intents
joblib.dump(rf_cv, "random_forest_cv_150.pkl")
# load
loaded_rf_cv_150 = joblib.load("random_forest_cv_150.pkl")
# Saving rf count vectorizer 3 intents
joblib.dump(rf_3i_cv, "random_forest_cv_3i.pkl")
# load
loaded_rf_3i_cv = joblib.load("random_forest_cv_3i.pkl")
# F1 score on val set 150 intents
f1_score(val_df['Target'], loaded_rf_cv_150.predict(val_vec_cv), average='macro')
0.7460097951152623
# F1 score on val set 3 intents
f1_score(three_intents_valdf['Target'], loaded_rf_3i_cv.predict(three_intents_val_vec_cv), average='macro')
0.9506533178503803
# Training the model for 150 intents
rf_sbert = RandomForestClassifier()
rf_sbert.fit(train_sent_embeddings, train_df['Target'])
# ROC AUC Score on validation set of rf model
roc_auc_score(val_df['Target'], rf_sbert.predict_proba(val_sent_embeddings), multi_class='ovr')
0.9941868568232664
# Training the model for 3 intents
rf_3i_sbert = RandomForestClassifier().fit(train_sent_embeddings_3i, three_intents_traindf['Target'])
roc_auc_score(three_intents_valdf['Target'], rf_3i_sbert.predict_proba(val_sent_embeddings_3i), multi_class='ovr')
1.0
plot_ROC_curve(rf_3i_sbert, train_sent_embeddings_3i, three_intents_traindf['Target'], val_sent_embeddings_3i, three_intents_valdf['Target'], three_intents_traindf)
ROCAUC(ax=<AxesSubplot:title={'center':'ROC Curves for RandomForestClassifier'}, xlabel='False Positive Rate', ylabel='True Positive Rate'>,
encoder={5: 'are_you_a_bot', 42: 'flight_status',
56: 'ingredients_list'},
estimator=RandomForestClassifier())
print("Classification Report : \n",classification_report(three_intents_valdf['Target'], rf_3i_sbert.predict(val_sent_embeddings_3i)))
Classification Report :
precision recall f1-score support
5 1.00 1.00 1.00 20
42 1.00 1.00 1.00 20
56 1.00 1.00 1.00 20
accuracy 1.00 60
macro avg 1.00 1.00 1.00 60
weighted avg 1.00 1.00 1.00 60
# Saving rf sentence bert 150 intents
joblib.dump(rf_sbert, "random_forest_sbert_150.pkl")
# load
loaded_rf_sbert_150 = joblib.load("random_forest_sbert_150.pkl")
# Saving rf sentence bert 3 intents
joblib.dump(rf_3i_sbert, "random_forest_sbert_3i.pkl")
# load
loaded_rf_3i_sbert = joblib.load("random_forest_sbert_3i.pkl")
# F1 score on val set 150 intents
f1_score(val_df['Target'], loaded_rf_sbert_150.predict(val_sent_embeddings), average='macro')
0.8880647930717221
# F1 score on val set 3 intents
f1_score(three_intents_valdf['Target'], loaded_rf_3i_sbert.predict(val_sent_embeddings_3i), average='macro')
1.0
# Training the model for 150 intents
lr_sg = LogisticRegression(random_state=0, n_jobs=-1)
lr_sg.fit(train_vecs_w2v, train_df['Target'])
# ROC AUC Score on validation set of lr model for 150 intents
roc_auc_score(val_df['Target'], lr_sg.predict_proba(val_vecs_w2v), multi_class='ovr')
0.9854964205816554
# Training the model for 3 intents
lr_sg_3i = LogisticRegression(random_state=0, n_jobs=-1)
lr_sg_3i.fit(train_vecs_w2v_3i, three_intents_traindf['Target'])
# ROC AUC Score on validation set of lr model for 3 intents
roc_auc_score(three_intents_valdf['Target'], lr_sg_3i.predict_proba(val_vecs_w2v_3i), multi_class='ovr')
1.0
plot_ROC_curve(lr_sg_3i, train_vecs_w2v_3i, three_intents_traindf['Target'], val_vecs_w2v_3i, three_intents_valdf['Target'], three_intents_traindf)
ROCAUC(ax=<AxesSubplot:title={'center':'ROC Curves for LogisticRegression'}, xlabel='False Positive Rate', ylabel='True Positive Rate'>,
encoder={5: 'are_you_a_bot', 42: 'flight_status',
56: 'ingredients_list'},
estimator=LogisticRegression(n_jobs=-1, random_state=0))
print("Classification Report : \n",classification_report(three_intents_valdf['Target'], lr_sg_3i.predict(val_vecs_w2v_3i)))
Classification Report :
precision recall f1-score support
5 1.00 0.85 0.92 20
42 1.00 1.00 1.00 20
56 0.87 1.00 0.93 20
accuracy 0.95 60
macro avg 0.96 0.95 0.95 60
weighted avg 0.96 0.95 0.95 60
# Saving lr skipgram 150 intents
joblib.dump(lr_sg, "logistic_regression_sg_150.pkl")
# load
loaded_lr_sg_150 = joblib.load("logistic_regression_sg_150.pkl")
# Saving lr skipgram 3 intents
joblib.dump(lr_sg_3i, "logistic_regression_sg_3i.pkl")
# load
loaded_lr_sg_3i = joblib.load("logistic_regression_sg_3i.pkl")
## F1 score on val set 150 intents
f1_score(val_df['Target'], loaded_lr_sg_150.predict(val_vecs_w2v), average='macro')
0.7364748076723715
# F1 score on val set 3 intents
f1_score(three_intents_valdf['Target'], loaded_lr_sg_3i.predict(val_vecs_w2v_3i), average='macro')
0.9497171590194845
# Training the lr model for 150 intents
lr_cv = LogisticRegression(random_state=0, n_jobs=-1)
lr_cv.fit(train_vec_cv, train_df['Target'])
LogisticRegression(n_jobs=-1, random_state=0)
# ROC AUC Score on validation set of lr model for 150 intents
roc_auc_score(val_df['Target'], lr_cv.predict_proba(val_vec_cv), multi_class='ovr')
0.9903702460850112
# Training the model for 3 intents
lr_3i_cv = LogisticRegression(random_state=0, n_jobs=-1).fit(three_intents_train_vec_cv, three_intents_traindf['Target'])
roc_auc_score(three_intents_valdf['Target'], lr_3i_cv.predict_proba(three_intents_val_vec_cv), multi_class='ovr')
1.0
plot_ROC_curve(lr_3i_cv, three_intents_train_vec_cv, three_intents_traindf['Target'], three_intents_val_vec_cv, three_intents_valdf['Target'], three_intents_traindf)
ROCAUC(ax=<AxesSubplot:title={'center':'ROC Curves for LogisticRegression'}, xlabel='False Positive Rate', ylabel='True Positive Rate'>,
encoder={5: 'are_you_a_bot', 42: 'flight_status',
56: 'ingredients_list'},
estimator=LogisticRegression(n_jobs=-1, random_state=0))
print("Classification Report : \n",classification_report(three_intents_valdf['Target'], lr_3i_cv.predict(three_intents_val_vec_cv)))
Classification Report :
precision recall f1-score support
5 1.00 1.00 1.00 20
42 1.00 1.00 1.00 20
56 1.00 1.00 1.00 20
accuracy 1.00 60
macro avg 1.00 1.00 1.00 60
weighted avg 1.00 1.00 1.00 60
# Saving lr count vectorizer 150 intents
joblib.dump(lr_cv, "logistic_regression_cv_150.pkl")
# load
loaded_lr_cv_150 = joblib.load("logistic_regression_cv_150.pkl")
# Saving lr count vectorizer 3 intents
joblib.dump(lr_3i_cv, "logistic_regression_cv_3i.pkl")
# load
loaded_lr_cv_3i = joblib.load("logistic_regression_cv_3i.pkl")
# F1 score on val set 150 intents
f1_score(val_df['Target'], loaded_lr_cv_150.predict(val_vec_cv), average='macro')
0.7807001775333827
# F1 score on val set 3 intents
f1_score(three_intents_valdf['Target'], loaded_lr_cv_3i.predict(three_intents_val_vec_cv), average='macro')
1.0
# Training the model for 150 intents
lr_sbert = LogisticRegression(random_state=0, n_jobs=-1)
lr_sbert.fit(train_sent_embeddings, train_df['Target'])
# ROC AUC Score on validation set of lr model
roc_auc_score(val_df['Target'], lr_sbert.predict_proba(val_sent_embeddings), multi_class='ovr')
0.9989079418344519
# Training the model for 3 intents
lr_3i_sbert = LogisticRegression(random_state=0, n_jobs=-1).fit(train_sent_embeddings_3i, three_intents_traindf['Target'])
roc_auc_score(three_intents_valdf['Target'], lr_3i_sbert.predict_proba(val_sent_embeddings_3i), multi_class='ovr')
1.0
plot_ROC_curve(lr_3i_sbert, train_sent_embeddings_3i, three_intents_traindf['Target'], val_sent_embeddings_3i, three_intents_valdf['Target'], three_intents_traindf)
ROCAUC(ax=<AxesSubplot:title={'center':'ROC Curves for LogisticRegression'}, xlabel='False Positive Rate', ylabel='True Positive Rate'>,
encoder={5: 'are_you_a_bot', 42: 'flight_status',
56: 'ingredients_list'},
estimator=LogisticRegression(n_jobs=-1, random_state=0))
print("Classification Report : \n",classification_report(three_intents_valdf['Target'], lr_3i_sbert.predict(val_sent_embeddings_3i)))
Classification Report :
precision recall f1-score support
5 1.00 1.00 1.00 20
42 1.00 1.00 1.00 20
56 1.00 1.00 1.00 20
accuracy 1.00 60
macro avg 1.00 1.00 1.00 60
weighted avg 1.00 1.00 1.00 60
# Saving lr sentence bert 150 intents
joblib.dump(lr_sbert, "logistic_regression_sbert_150.pkl")
# load
loaded_lr_sbert_150 = joblib.load("logistic_regression_sbert_150.pkl")
# Saving lr sentence bert 3 intents
joblib.dump(lr_3i_sbert, "logistic_regression_sbert_3i.pkl")
# load
loaded_lr_sbert_3i = joblib.load("logistic_regression_sbert_3i.pkl")
# F1 score on val set 150 intents
f1_score(val_df['Target'], loaded_lr_sbert_150.predict(val_sent_embeddings), average='macro')
0.9472071861576891
# F1 score on val set 3 intents
f1_score(three_intents_valdf['Target'], loaded_lr_sbert_3i.predict(val_sent_embeddings_3i), average='macro')
1.0
from tqdm import tqdm
import json
import numpy as np
import pandas as pd
import torch
import warnings
warnings.filterwarnings('ignore')
from transformers import BertConfig, AutoTokenizer, TFAutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification, DistilBertTokenizerFast, DistilBertForTokenClassification
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)
cpu
df = pd.read_csv('money_corpus.csv')
df.reset_index(inplace=True)
df.rename(columns={'index':'sentence #'},inplace=True)
df['sentence #'] = df['sentence #'].apply(lambda x : 'sentence:'+str(x))
df['text'] = df['text'].apply(lambda x : x.split())
# df = df.explode('text')
# df.to_csv('exploded_text.csv',index=False)
tagged = pd.read_csv('exploded_text.csv')
tagged
| sentence # | text | tag | |
|---|---|---|---|
| 0 | sentence:0 | how | O |
| 1 | sentence:0 | would | O |
| 2 | sentence:0 | you | O |
| 3 | sentence:0 | say | O |
| 4 | sentence:0 | can | O |
| ... | ... | ... | ... |
| 1327 | sentence:140 | limit | O |
| 1328 | sentence:140 | to | O |
| 1329 | sentence:140 | one | B-mon |
| 1330 | sentence:140 | thousand | I-mon |
| 1331 | sentence:140 | dollars | I-mon |
1332 rows × 3 columns
tagged['sentence'] = tagged[['sentence #','text','tag']].groupby(['sentence #'])['text'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence
tagged['word_labels'] = tagged[['sentence #','text','tag']].groupby(['sentence #'])['tag'].transform(lambda x: ','.join(x))
tagged
| sentence # | text | tag | sentence | word_labels | |
|---|---|---|---|---|---|
| 0 | sentence:0 | how | O | how would you say can i borrow five dollars in... | O,O,O,O,O,O,O,B-mon,I-mon,O,O |
| 1 | sentence:0 | would | O | how would you say can i borrow five dollars in... | O,O,O,O,O,O,O,B-mon,I-mon,O,O |
| 2 | sentence:0 | you | O | how would you say can i borrow five dollars in... | O,O,O,O,O,O,O,B-mon,I-mon,O,O |
| 3 | sentence:0 | say | O | how would you say can i borrow five dollars in... | O,O,O,O,O,O,O,B-mon,I-mon,O,O |
| 4 | sentence:0 | can | O | how would you say can i borrow five dollars in... | O,O,O,O,O,O,O,B-mon,I-mon,O,O |
| ... | ... | ... | ... | ... | ... |
| 1327 | sentence:140 | limit | O | please see if you can increase my credit limit... | O,O,O,O,O,O,O,O,O,O,B-mon,I-mon,I-mon |
| 1328 | sentence:140 | to | O | please see if you can increase my credit limit... | O,O,O,O,O,O,O,O,O,O,B-mon,I-mon,I-mon |
| 1329 | sentence:140 | one | B-mon | please see if you can increase my credit limit... | O,O,O,O,O,O,O,O,O,O,B-mon,I-mon,I-mon |
| 1330 | sentence:140 | thousand | I-mon | please see if you can increase my credit limit... | O,O,O,O,O,O,O,O,O,O,B-mon,I-mon,I-mon |
| 1331 | sentence:140 | dollars | I-mon | please see if you can increase my credit limit... | O,O,O,O,O,O,O,O,O,O,B-mon,I-mon,I-mon |
1332 rows × 5 columns
tagged_final = tagged[['sentence', 'word_labels']]
tagged_final
| sentence | word_labels | |
|---|---|---|
| 0 | how would you say can i borrow five dollars in... | O,O,O,O,O,O,O,B-mon,I-mon,O,O |
| 1 | how would you say can i borrow five dollars in... | O,O,O,O,O,O,O,B-mon,I-mon,O,O |
| 2 | how would you say can i borrow five dollars in... | O,O,O,O,O,O,O,B-mon,I-mon,O,O |
| 3 | how would you say can i borrow five dollars in... | O,O,O,O,O,O,O,B-mon,I-mon,O,O |
| 4 | how would you say can i borrow five dollars in... | O,O,O,O,O,O,O,B-mon,I-mon,O,O |
| ... | ... | ... |
| 1327 | please see if you can increase my credit limit... | O,O,O,O,O,O,O,O,O,O,B-mon,I-mon,I-mon |
| 1328 | please see if you can increase my credit limit... | O,O,O,O,O,O,O,O,O,O,B-mon,I-mon,I-mon |
| 1329 | please see if you can increase my credit limit... | O,O,O,O,O,O,O,O,O,O,B-mon,I-mon,I-mon |
| 1330 | please see if you can increase my credit limit... | O,O,O,O,O,O,O,O,O,O,B-mon,I-mon,I-mon |
| 1331 | please see if you can increase my credit limit... | O,O,O,O,O,O,O,O,O,O,B-mon,I-mon,I-mon |
1332 rows × 2 columns
tagged_final = tagged_final.drop_duplicates().reset_index(drop=True)
tagged_final
| sentence | word_labels | |
|---|---|---|
| 0 | how would you say can i borrow five dollars in... | O,O,O,O,O,O,O,B-mon,I-mon,O,O |
| 1 | send 100 dollars between bank of the west and ... | O,B-mon,I-mon,O,O,O,O,O,O,O,O,O,O |
| 2 | send 50 dollars between bank of america and ch... | O,B-mon,I-mon,O,O,O,O,O,O,O |
| 3 | send 2000 dollars between chase and rabobank a... | O,B-mon,I-mon,O,O,O,O,O |
| 4 | send 1200 dollars between usaa and navy federa... | O,B-mon,I-mon,O,O,O,O,O,O |
| ... | ... | ... |
| 136 | can i increase my credit limit to 1100 dollars | O,O,O,O,O,O,O,B-mon,I-mon |
| 137 | can you increase my credit limit to one thousa... | O,O,O,O,O,O,O,B-mon,I-mon,I-mon |
| 138 | can my credit limit be increased to one thousa... | O,O,O,O,O,O,O,B-mon,I-mon,I-mon |
| 139 | tell me if my credit limit can be increased to... | O,O,O,O,O,O,O,O,O,O,B-mon,I-mon,I-mon |
| 140 | please see if you can increase my credit limit... | O,O,O,O,O,O,O,O,O,O,B-mon,I-mon,I-mon |
141 rows × 2 columns
tagged.tag.unique()
array(['O', 'B-mon', 'I-mon'], dtype=object)
label_to_id = {v:k for k,v in enumerate(tagged.tag.unique())}
id_to_label = {k:v for k,v in enumerate(tagged.tag.unique())}
label_to_id
{'O': 0, 'B-mon': 1, 'I-mon': 2}
id_to_label
{0: 'O', 1: 'B-mon', 2: 'I-mon'}
MAX_LEN = 64
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 10
LEARNING_RATE = 1e-04
MAX_GRAD_NORM = 10
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
train_size = 0.91
train_dataset = tagged_final.sample(frac=train_size, random_state=0)
test_dataset = tagged_final.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
print("FULL Dataset: {}".format(tagged_final.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))
FULL Dataset: (141, 2) TRAIN Dataset: (128, 2) TEST Dataset: (13, 2)
class dataset(Dataset):
    """Token-classification dataset over a DataFrame with `sentence` and
    `word_labels` (comma-separated BIO tags, one per whitespace word) columns.

    Each item is the fast-tokenizer encoding of the sentence plus a `labels`
    tensor aligned to the tokens. Only non-'O' labels (B-mon / I-mon) are made
    active; special tokens, padding, sub-token continuations and 'O' words are
    set to -100 so the loss ignores them.
    NOTE(review): excluding 'O' from the loss mirrors the original notebook's
    behaviour -- confirm it is intended rather than training on all word labels.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: get the sentence and its per-word labels
        sentence = self.data.sentence[index].strip()
        word_labels = self.data.word_labels[index].split(',')
        # step 2: encode with offsets/word ids so tokens map back to words
        encoding = self.tokenizer(sentence,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)
        labels = [label_to_id[label] for label in word_labels]
        # step 3: align word labels to tokens. The original code placed label i at
        # token position i, which mis-aligns by the leading [CLS] token and by any
        # word that splits into several sub-tokens (visible in the notebook's own
        # example dump, where B-mon landed on 'transfer' instead of 'sixty').
        # Use the fast tokenizer's word_ids() to put each word's label on the
        # first sub-token of that word instead.
        encoded_labels = np.full(len(encoding["offset_mapping"]), -100, dtype=int)
        previous_word = None
        for token_idx, word_idx in enumerate(encoding.word_ids()):
            if (word_idx is not None and word_idx != previous_word
                    and word_idx < len(labels) and labels[word_idx] > 0):
                encoded_labels[token_idx] = labels[word_idx]
            previous_word = word_idx
        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        return item

    def __len__(self):
        return self.len
# Wrap the train/test DataFrames in the Dataset defined above.
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)
# Sanity-check one example: raw sentence, its word labels, and the encoded item.
train_dataset['sentence'][2]
'transfer sixty dollars to dad from my biggest accnt'
train_dataset['word_labels'][2]
'O,B-mon,I-mon,O,O,O,O,O,O'
training_set[2]
{'input_ids': tensor([ 101, 4651, 8442, 6363, 2000, 3611, 2013, 2026, 5221, 16222,
3372, 102, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0]),
'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
'offset_mapping': tensor([[ 0, 0],
[ 0, 8],
[ 9, 14],
[15, 22],
[23, 25],
[26, 29],
[30, 34],
[35, 37],
[38, 45],
[46, 49],
[49, 51],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0]]),
'labels': tensor([-100, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100], dtype=torch.int32)}
# DataLoader settings for the training and evaluation splits (single-process).
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
# Token-classification head on pretrained BERT; one output unit per BIO tag.
# NOTE(review): BertForTokenClassification and `device` are not defined in the
# visible header -- presumably set up in an earlier notebook cell; confirm.
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_to_id))
model.to(device)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight'] - This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
BertForTokenClassification(
(bert): BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(30522, 768, padding_idx=0)
(position_embeddings): Embedding(512, 768)
(token_type_embeddings): Embedding(2, 768)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): BertEncoder(
(layer): ModuleList(
(0): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(1): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(2): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(3): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(4): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(5): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(6): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(7): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(8): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(9): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(10): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(11): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
)
(dropout): Dropout(p=0.1, inplace=False)
(classifier): Linear(in_features=768, out_features=3, bias=True)
)
# Plain Adam over all model parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

def train(epoch):
    """Run one pass over training_loader, printing running loss and epoch accuracy.

    `epoch` is only used by the caller's log line; model, optimizer, loader and
    device are taken from module globals, as in the original notebook.
    """
    loss, accuracy = 0, 0
    no_train_samples, no_steps = 0, 0
    acc_steps = 0  # batches that actually contributed an accuracy sample
    pred_train, label_train = [], []
    model.train()
    for i, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)
        res = model(input_ids=ids, attention_mask=mask, labels=labels)
        batch_loss = res[0]
        # accumulate a float, not the loss tensor, so each step's computation
        # graph can be freed instead of being kept alive by the running total
        loss += batch_loss.item()
        no_steps += 1
        no_train_samples += labels.size(0)
        if i % 5 == 0:
            print(f"Training loss per 5 training steps: {loss / no_steps}")
        # flatten to (batch*seq,) and keep only positions with a real label (!= -100)
        flattened_targets = labels.view(-1)
        active_logits = res[1].view(-1, model.num_labels)
        flattened_predictions = torch.argmax(active_logits, dim=1)
        active_accuracy = flattened_targets != -100
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        label_train.extend(labels)
        pred_train.extend(predictions)
        # a batch can contain no active labels, which made accuracy_score return
        # nan in the original run's logs -- skip such batches in the accuracy mean
        if labels.numel() > 0:
            accuracy += accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            acc_steps += 1
        # backward pass: zero -> backward -> clip -> step. The original clipped
        # BEFORE backward, i.e. it clipped the previous batch's (about to be
        # zeroed) gradients, and the fresh gradients were applied unclipped.
        optimizer.zero_grad()
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        optimizer.step()
    print(f"Training loss epoch: {loss / no_steps}")
    print(f"Training accuracy epoch: {accuracy / max(acc_steps, 1)}")

for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)
Training epoch: 1 Training loss per 5 training steps: 1.2308019399642944 Training loss per 5 training steps: 0.6021385192871094 Training loss per 5 training steps: 0.40730807185173035 Training loss per 5 training steps: 0.3008166551589966 Training loss per 5 training steps: 0.2377745509147644 Training loss epoch: 0.2447919100522995 Training accuracy epoch: 0.8922222222222222 Training epoch: 2 Training loss per 5 training steps: 0.005527029279619455 Training loss per 5 training steps: 0.008797748945653439 Training loss per 5 training steps: 0.008873667567968369 Training loss per 5 training steps: 0.013189241290092468 Training loss per 5 training steps: 0.013651364482939243 Training loss epoch: 0.07044490426778793 Training accuracy epoch: 0.9805555555555556 Training epoch: 3 Training loss per 5 training steps: 0.2950831949710846 Training loss per 5 training steps: 0.09171789139509201 Training loss per 5 training steps: 0.09201046079397202 Training loss per 5 training steps: 0.0827510729432106 Training loss per 5 training steps: 0.06422960013151169 Training loss epoch: 0.0543522983789444 Training accuracy epoch: 0.9755555555555556 Training epoch: 4 Training loss per 5 training steps: 0.0012055047554895282 Training loss per 5 training steps: 0.001303982688114047 Training loss per 5 training steps: 0.04686259850859642 Training loss per 5 training steps: 0.0328807532787323 Training loss per 5 training steps: 0.026509836316108704 Training loss epoch: 0.022478410974144936 Training accuracy epoch: 0.9933333333333333 Training epoch: 5 Training loss per 5 training steps: 0.001204920932650566 Training loss per 5 training steps: 0.004043417051434517 Training loss per 5 training steps: 0.00443153316155076 Training loss per 5 training steps: 0.003648828249424696 Training loss per 5 training steps: 0.0029919843655079603 Training loss epoch: 0.0026236807461827993 Training accuracy epoch: 1.0 Training epoch: 6 Training loss per 5 training steps: 0.000606226094532758 Training loss 
per 5 training steps: 0.0009039614233188331 Training loss per 5 training steps: 0.0006981437327340245 Training loss per 5 training steps: 0.0006968728266656399 Training loss per 5 training steps: 0.0007433216669596732 Training loss epoch: 0.00072712660767138 Training accuracy epoch: 1.0 Training epoch: 7 Training loss per 5 training steps: 0.0004225874727126211 Training loss per 5 training steps: 0.00040431658271700144 Training loss per 5 training steps: 0.0004310998774599284 Training loss per 5 training steps: 0.0005477844388224185 Training loss per 5 training steps: 0.0005745095550082624 Training loss epoch: 0.0005428720614872873 Training accuracy epoch: nan Training epoch: 8 Training loss per 5 training steps: 0.000567563169170171 Training loss per 5 training steps: 0.00042706061503849924 Training loss per 5 training steps: 0.000506585172843188 Training loss per 5 training steps: 0.00045995015534572303 Training loss per 5 training steps: 0.00047425483353435993 Training loss epoch: 0.0004560407833196223 Training accuracy epoch: 1.0 Training epoch: 9 Training loss per 5 training steps: 0.00030048011103644967 Training loss per 5 training steps: 0.000329753354890272 Training loss per 5 training steps: 0.00036693408037535846 Training loss per 5 training steps: 0.00048753948067314923 Training loss per 5 training steps: 0.0004487417754717171 Training loss epoch: 0.0004223502764943987 Training accuracy epoch: 1.0 Training epoch: 10 Training loss per 5 training steps: 0.00022241912665776908 Training loss per 5 training steps: 0.00029383122455328703 Training loss per 5 training steps: 0.00035656854743137956 Training loss per 5 training steps: 0.0003356702218297869 Training loss per 5 training steps: 0.00033125796471722424 Training loss epoch: 0.0003134088183287531 Training accuracy epoch: 1.0
# Second experiment: DistilBERT tokenizer + token-classification head.
# NOTE(review): DistilBertTokenizerFast / DistilBertForTokenClassification are not
# imported in the visible header -- presumably imported earlier; confirm. This
# rebinds `tokenizer` and `model`, replacing the BERT versions used above.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_to_id))
model.to(device)
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight'] - This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
DistilBertForTokenClassification(
(distilbert): DistilBertModel(
(embeddings): Embeddings(
(word_embeddings): Embedding(30522, 768, padding_idx=0)
(position_embeddings): Embedding(512, 768)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(transformer): Transformer(
(layer): ModuleList(
(0): TransformerBlock(
(attention): MultiHeadSelfAttention(
(dropout): Dropout(p=0.1, inplace=False)
(q_lin): Linear(in_features=768, out_features=768, bias=True)
(k_lin): Linear(in_features=768, out_features=768, bias=True)
(v_lin): Linear(in_features=768, out_features=768, bias=True)
(out_lin): Linear(in_features=768, out_features=768, bias=True)
)
(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(ffn): FFN(
(dropout): Dropout(p=0.1, inplace=False)
(lin1): Linear(in_features=768, out_features=3072, bias=True)
(lin2): Linear(in_features=3072, out_features=768, bias=True)
)
(output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(1): TransformerBlock(
(attention): MultiHeadSelfAttention(
(dropout): Dropout(p=0.1, inplace=False)
(q_lin): Linear(in_features=768, out_features=768, bias=True)
(k_lin): Linear(in_features=768, out_features=768, bias=True)
(v_lin): Linear(in_features=768, out_features=768, bias=True)
(out_lin): Linear(in_features=768, out_features=768, bias=True)
)
(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(ffn): FFN(
(dropout): Dropout(p=0.1, inplace=False)
(lin1): Linear(in_features=768, out_features=3072, bias=True)
(lin2): Linear(in_features=3072, out_features=768, bias=True)
)
(output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(2): TransformerBlock(
(attention): MultiHeadSelfAttention(
(dropout): Dropout(p=0.1, inplace=False)
(q_lin): Linear(in_features=768, out_features=768, bias=True)
(k_lin): Linear(in_features=768, out_features=768, bias=True)
(v_lin): Linear(in_features=768, out_features=768, bias=True)
(out_lin): Linear(in_features=768, out_features=768, bias=True)
)
(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(ffn): FFN(
(dropout): Dropout(p=0.1, inplace=False)
(lin1): Linear(in_features=768, out_features=3072, bias=True)
(lin2): Linear(in_features=3072, out_features=768, bias=True)
)
(output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(3): TransformerBlock(
(attention): MultiHeadSelfAttention(
(dropout): Dropout(p=0.1, inplace=False)
(q_lin): Linear(in_features=768, out_features=768, bias=True)
(k_lin): Linear(in_features=768, out_features=768, bias=True)
(v_lin): Linear(in_features=768, out_features=768, bias=True)
(out_lin): Linear(in_features=768, out_features=768, bias=True)
)
(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(ffn): FFN(
(dropout): Dropout(p=0.1, inplace=False)
(lin1): Linear(in_features=768, out_features=3072, bias=True)
(lin2): Linear(in_features=3072, out_features=768, bias=True)
)
(output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(4): TransformerBlock(
(attention): MultiHeadSelfAttention(
(dropout): Dropout(p=0.1, inplace=False)
(q_lin): Linear(in_features=768, out_features=768, bias=True)
(k_lin): Linear(in_features=768, out_features=768, bias=True)
(v_lin): Linear(in_features=768, out_features=768, bias=True)
(out_lin): Linear(in_features=768, out_features=768, bias=True)
)
(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(ffn): FFN(
(dropout): Dropout(p=0.1, inplace=False)
(lin1): Linear(in_features=768, out_features=3072, bias=True)
(lin2): Linear(in_features=3072, out_features=768, bias=True)
)
(output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(5): TransformerBlock(
(attention): MultiHeadSelfAttention(
(dropout): Dropout(p=0.1, inplace=False)
(q_lin): Linear(in_features=768, out_features=768, bias=True)
(k_lin): Linear(in_features=768, out_features=768, bias=True)
(v_lin): Linear(in_features=768, out_features=768, bias=True)
(out_lin): Linear(in_features=768, out_features=768, bias=True)
)
(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(ffn): FFN(
(dropout): Dropout(p=0.1, inplace=False)
(lin1): Linear(in_features=768, out_features=3072, bias=True)
(lin2): Linear(in_features=3072, out_features=768, bias=True)
)
(output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
)
)
)
(dropout): Dropout(p=0.1, inplace=False)
(classifier): Linear(in_features=768, out_features=3, bias=True)
)
# Fresh Adam for the DistilBERT model just created.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

def train(epoch):
    """Run one pass over training_loader, printing running loss and epoch accuracy.

    Identical driver to the BERT run above; `epoch` is only used by the caller's
    log line, and model/optimizer/loader/device come from module globals.
    """
    loss, accuracy = 0, 0
    no_train_samples, no_steps = 0, 0
    acc_steps = 0  # batches that actually contributed an accuracy sample
    pred_train, label_train = [], []
    model.train()
    for i, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)
        res = model(input_ids=ids, attention_mask=mask, labels=labels)
        batch_loss = res[0]
        # accumulate a float, not the loss tensor, so each step's computation
        # graph can be freed instead of being kept alive by the running total
        loss += batch_loss.item()
        no_steps += 1
        no_train_samples += labels.size(0)
        if i % 5 == 0:
            print(f"Training loss per 5 training steps: {loss / no_steps}")
        # flatten to (batch*seq,) and keep only positions with a real label (!= -100)
        flattened_targets = labels.view(-1)
        active_logits = res[1].view(-1, model.num_labels)
        flattened_predictions = torch.argmax(active_logits, dim=1)
        active_accuracy = flattened_targets != -100
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        label_train.extend(labels)
        pred_train.extend(predictions)
        # a batch can contain no active labels, which made accuracy_score return
        # nan in the original run's logs -- skip such batches in the accuracy mean
        if labels.numel() > 0:
            accuracy += accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            acc_steps += 1
        # backward pass: zero -> backward -> clip -> step. The original clipped
        # BEFORE backward, i.e. it clipped the previous batch's (about to be
        # zeroed) gradients, and the fresh gradients were applied unclipped.
        optimizer.zero_grad()
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        optimizer.step()
    print(f"Training loss epoch: {loss / no_steps}")
    print(f"Training accuracy epoch: {accuracy / max(acc_steps, 1)}")

for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)
Training epoch: 1 Training loss per 5 training steps: 1.0631675720214844 Training loss per 5 training steps: 0.4809984564781189 Training loss per 5 training steps: 0.4176792800426483 Training loss per 5 training steps: 0.3285141885280609 Training loss per 5 training steps: 0.293161541223526 Training loss per 5 training steps: 0.2414022833108902 Training loss per 5 training steps: 0.22393642365932465 Training loss epoch: 0.2182541936635971 Training accuracy epoch: 0.9361979166666666 Training epoch: 2 Training loss per 5 training steps: 0.008687698282301426 Training loss per 5 training steps: 0.28027164936065674 Training loss per 5 training steps: 0.1997462660074234 Training loss per 5 training steps: 0.14878442883491516 Training loss per 5 training steps: 0.11761162430047989 Training loss per 5 training steps: 0.0993460938334465 Training loss per 5 training steps: 0.08546379953622818 Training loss epoch: 0.08297470211982727 Training accuracy epoch: 0.979389880952381 Training epoch: 3 Training loss per 5 training steps: 0.010782948695123196 Training loss per 5 training steps: 0.005648698657751083 Training loss per 5 training steps: 0.004410024266690016 Training loss per 5 training steps: 0.054690029472112656 Training loss per 5 training steps: 0.04390803351998329 Training loss per 5 training steps: 0.03783927485346794 Training loss per 5 training steps: 0.03298785537481308 Training loss epoch: 0.03208524361252785 Training accuracy epoch: 0.9947916666666667 Training epoch: 4 Training loss per 5 training steps: 0.008631018921732903 Training loss per 5 training steps: 0.004966350272297859 Training loss per 5 training steps: 0.004407668951898813 Training loss per 5 training steps: 0.0036143541801720858 Training loss per 5 training steps: 0.004665722604840994 Training loss per 5 training steps: 0.004188213497400284 Training loss per 5 training steps: 0.009558460675179958 Training loss epoch: 0.009398075751960278 Training accuracy epoch: 0.9947916666666666 Training epoch: 
5 Training loss per 5 training steps: 0.0009002589504234493 Training loss per 5 training steps: 0.0013914997689425945 Training loss per 5 training steps: 0.0025324528105556965 Training loss per 5 training steps: 0.001981407403945923 Training loss per 5 training steps: 0.001825391547754407 Training loss per 5 training steps: 0.021362431347370148 Training loss per 5 training steps: 0.018939299508929253 Training loss epoch: 0.01837846450507641 Training accuracy epoch: 0.9955357142857143 Training epoch: 6 Training loss per 5 training steps: 0.005368690937757492 Training loss per 5 training steps: 0.05552434176206589 Training loss per 5 training steps: 0.04340510442852974 Training loss per 5 training steps: 0.046762607991695404 Training loss per 5 training steps: 0.04069221764802933 Training loss per 5 training steps: 0.03334634751081467 Training loss per 5 training steps: 0.028116842731833458 Training loss epoch: 0.027898326516151428 Training accuracy epoch: 0.9902777777777778 Training epoch: 7 Training loss per 5 training steps: 0.0006054409313946962 Training loss per 5 training steps: 0.0017326291417703032 Training loss per 5 training steps: 0.0012943926267325878 Training loss per 5 training steps: 0.001172870397567749 Training loss per 5 training steps: 0.001197592238895595 Training loss per 5 training steps: 0.0010789675870910287 Training loss per 5 training steps: 0.0016145361587405205 Training loss epoch: 0.0016050604172050953 Training accuracy epoch: 1.0 Training epoch: 8 Training loss per 5 training steps: 0.0014251439133659005 Training loss per 5 training steps: 0.0009146408992819488 Training loss per 5 training steps: 0.0008830941515043378 Training loss per 5 training steps: 0.0008495254442095757 Training loss per 5 training steps: 0.0007457883330062032 Training loss per 5 training steps: 0.0017588974442332983 Training loss per 5 training steps: 0.0015469196951016784 Training loss epoch: 0.0015120194293558598 Training accuracy epoch: nan Training epoch: 9 
Training loss per 5 training steps: 0.0006757899536751211 Training loss per 5 training steps: 0.0004559016670100391 Training loss per 5 training steps: 0.0005502548301592469 Training loss per 5 training steps: 0.00047555589117109776 Training loss per 5 training steps: 0.00046993012074381113 Training loss per 5 training steps: 0.0004445247177500278 Training loss per 5 training steps: 0.00043491797987371683 Training loss epoch: 0.0004482759104575962 Training accuracy epoch: 1.0 Training epoch: 10 Training loss per 5 training steps: 0.0002681533806025982 Training loss per 5 training steps: 0.00036055134842172265 Training loss per 5 training steps: 0.0003555223229341209 Training loss per 5 training steps: 0.0006156986346468329 Training loss per 5 training steps: 0.0005388441495597363 Training loss per 5 training steps: 0.0005019650561735034 Training loss per 5 training steps: 0.00048019055975601077 Training loss epoch: 0.00050497823394835 Training accuracy epoch: 1.0
def valid(model, testing_loader):
    """Evaluate a token-classification model on a validation/test loader.

    Iterates `testing_loader`, accumulates the model's loss and a per-batch
    accuracy over non-ignored token positions (label id != -100), prints
    running/final metrics, and returns the flat gold/predicted tag sequences.

    Parameters:
        model: a HuggingFace-style token classifier returning (loss, logits)
               when called with `labels`.
        testing_loader: iterable of dict batches with 'input_ids',
               'attention_mask' and 'labels' tensors.

    Returns:
        (labels, predictions): two parallel lists of string tags, mapped
        through the module-level `id_to_label` lookup.
    """
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    no_eval_samples, no_steps = 0, 0
    eval_preds, eval_labels = [], []
    with torch.no_grad():
        for i, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)
            res = model(input_ids=ids, attention_mask=mask, labels=labels)
            # .item() detaches the scalar loss so we accumulate plain floats
            # instead of a chain of tensors (original accumulated tensors,
            # which also made the printed loss render as `tensor(...)`).
            eval_loss += res[0].item()
            no_steps += 1
            no_eval_samples += labels.size(0)
            if i % 5 == 0:
                loss_step = eval_loss / no_steps
                print(f"validation loss per 5 eval steps: {loss_step}")
            flattened_targets = labels.view(-1)            # (batch_size * seq_len,)
            active_logits = res[1].view(-1, model.num_labels)  # (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)
            # Only score positions whose label is not the ignore index -100
            # (padding / special tokens). The original also built an
            # `active_labels` tensor here that was never used; removed.
            active_accuracy = flattened_targets != -100
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            # NOTE(review): this zeroes the batch accuracy whenever every
            # active label id is 0 — presumably to discount all-'O' batches.
            # Confirm this is intentional; it also skews the epoch average.
            if not np.any(labels.cpu().numpy()):
                tmp_eval_accuracy = 0
            eval_accuracy += tmp_eval_accuracy
    # Map accumulated id tensors back to human-readable tag strings.
    labels = [id_to_label[id.item()] for id in eval_labels]
    predictions = [id_to_label[id.item()] for id in eval_preds]
    eval_loss = eval_loss / no_steps
    eval_accuracy = eval_accuracy / no_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    return labels, predictions
# Run evaluation on the held-out loader; returns flat gold/predicted tag lists.
labels, predictions = valid(model, testing_loader)
validation loss per 5 eval steps: 1.0310940742492676 validation loss per 5 eval steps: 1.0833351612091064 Validation Loss: 1.07457435131073 Validation Accuracy: 0.3904761904761905
# Single-sentence inference demo: tokenise one example and run a forward pass.
sentence = "1000 dollars is lot of money"
# Pad/truncate to the fixed training length; offset mapping is requested but
# not consumed below — presumably kept for word-level alignment; verify.
inputs = tokenizer(sentence.strip(),
return_offsets_mapping=True,
padding='max_length',
truncation=True,
max_length=MAX_LEN,
return_tensors="pt")
# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, attention_mask=mask)
# Without `labels`, the first output element is the logits tensor.
logits = outputs[0]
# Flatten to (seq_len, num_labels) for per-token argmax.
active_logits = logits.view(-1, model.num_labels)
# Notebook display expression: show the label-space size (printed as 3 below).
model.num_labels
3
# Greedy per-token decoding: pick the highest-scoring label id at each position.
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level
# Notebook display expression: inspect the raw predicted id tensor.
flattened_predictions
tensor([2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1], device='cuda:0')
# Recover wordpiece strings and pair each with its predicted tag, dropping
# the special tokens that BERT adds around/after the sentence.
tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [id_to_label[idx] for idx in flattened_predictions.cpu().numpy()]
preds = [
    (tok, tag)
    for tok, tag in zip(tokens, token_predictions)
    if tok not in ('[CLS]', '[SEP]', '[PAD]')
]
print(sentence.split())
print(preds)
['1000', 'dollars', 'is', 'lot', 'of', 'money']
[('1000', 'I-mon'), ('dollars', 'I-mon'), ('is', 'O'), ('lot', 'O'), ('of', 'O'), ('money', 'O')]